For this project, I wanted to map the number of police use of force cases across county jurisdictions for the state of California. I will be using data obtained from the California Department of Justice’s (DOJ) Use of Force dataset, obtained from https://openjustice.doj.ca.gov/data. The California DOJ provides open access data on the criminal justice system.
The Use of Force dataset incluces data from 2016-2018 of incidents that resulted in serious bodily injury or death or involved the discharge of a firearm, as reported by Law Enforcement Agencies and other entities throughout the state that employ peace officers.
library(readr)
library(ggplot2)
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(maps)
library(mapdata)
library(tidyverse)
## ── Attaching packages ──────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.0 ✓ dplyr 0.8.5
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ purrr 0.3.4 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::map() masks maps::map()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
##
## intersect, setdiff, union
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(viridis)
## Loading required package: viridisLite
library(flexdashboard)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
##
## wind
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(knitr)
library(hms)
##
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
##
## hms
I began by importing the data from the California DOJ website.
library(readr)
Incident_2018 <- read_csv(url("https://data-openjustice.doj.ca.gov/sites/default/files/dataset/2019-07/URSUS_Incident_2018.csv"))
## Parsed with column specification:
## cols(
## INCIDENT_ID = col_character(),
## ORI = col_character(),
## INCIDENT_DATE_STR = col_character(),
## INCIDENT_TIME_STR = col_time(format = ""),
## CITY = col_character(),
## COUNTY = col_character(),
## STATE = col_character(),
## ZIP_CODE = col_double(),
## MULTIPLE_LOCATIONS = col_logical(),
## ON_K12_CAMPUS = col_logical(),
## ARREST_MADE = col_logical(),
## CRIME_REPORT_FILED = col_logical(),
## CONTACT_REASON = col_character(),
## IN_CUSTODY_REASON = col_character(),
## NUM_INVOLVED_CIVILIANS = col_double(),
## NUM_INVOLVED_OFFICERS = col_double(),
## NUM_INVOLVED_AGENCIES = col_double()
## )
Incident_2017 <- read_csv(url("https://data-openjustice.doj.ca.gov/sites/default/files/dataset/2018-08/URSUS_Incident_2017.csv"))
## Parsed with column specification:
## cols(
## INCIDENT_ID = col_character(),
## ORI = col_character(),
## INCIDENT_DATE_STR = col_character(),
## INCIDENT_TIME_STR = col_time(format = ""),
## CITY = col_character(),
## COUNTY = col_character(),
## STATE = col_character(),
## ZIP_CODE = col_double(),
## MULTIPLE_LOCATIONS = col_logical(),
## ON_K12_CAMPUS = col_logical(),
## ARREST_MADE = col_logical(),
## CRIME_REPORT_FILED = col_logical(),
## CONTACT_REASON = col_character(),
## IN_CUSTODY_REASON = col_character(),
## NUM_INVOLVED_CIVILIANS = col_double(),
## NUM_INVOLVED_OFFICERS = col_double(),
## PRIMARY_AGENCY_INDICATOR = col_character()
## )
Incident_2016 <- read_csv(url("https://data-openjustice.doj.ca.gov/sites/default/files/dataset/2018-08/URSUS_Incident_2016.csv"))
## Parsed with column specification:
## cols(
## Incident_ID = col_character(),
## ORI = col_character(),
## Incident_Date_Str = col_character(),
## Incident_Time_Str = col_double(),
## City = col_character(),
## County = col_character(),
## State = col_character(),
## Zip_Code = col_double(),
## Multiple_Locations = col_logical(),
## On_K12_Campus = col_logical(),
## Arrest_Made = col_logical(),
## Crime_Report_Filed = col_logical(),
## Contact_Reason = col_character(),
## In_Custody_Reason = col_character(),
## Num_Involved_Civilians = col_double(),
## Num_Involved_Officers = col_double()
## )
The data consists of several flat files including information on each individual incident, each person - officer and civilian - invovled in the incident and each agency involved in the incident. These flat files are related along the key, Incident_ID, which is the identifier used to represent each unique incident or case, or the key, ORI, which is the number of the reporting agency.
view(Incident_2016)
view(Incident_2017)
view(Incident_2018)
To bind the data, I used the row_bind function in the dplyr package. To merge the 2016, 2017 and 2018 incident datasets, I needed to first ensure that all the variable names and variable types were identical. This required cleaning up the 2016 dataset by renaming variables and converting the INCIDENT_TIME_STR variable to a time variable, to match with the 2017 and 2018 datasets.
Incident_2016 <- rename(Incident_2016,
"INCIDENT_ID" = "Incident_ID",
"ORI" = "ORI",
"INCIDENT_DATE_STR" = "Incident_Date_Str",
"INCIDENT_TIME_STR" = "Incident_Time_Str",
"CITY" = "City",
"COUNTY" = "County",
"STATE" = "State",
"ZIP CODE" = "Zip_Code",
"MULTIPLE_LOCATIONS" = "Multiple_Locations",
"ON_K12_CAMPUS" = "On_K12_Campus",
"ARREST_MADE" = "Arrest_Made",
"CRIME_REPORT_FILED" = "Crime_Report_Filed",
"CONTACT_REASON" = "Contact_Reason",
"IN_CUSTODY_REASON" = "In_Custody_Reason",
"NUM_INVOLVED_CIVILIANS" = "Num_Involved_Civilians",
"NUM_INVOLVED_OFFICERS" = "Num_Involved_Officers")
Incident_2016 <- Incident_2016 %>%
mutate(INCIDENT_TIME_STR = as.hms(Incident_2016$INCIDENT_TIME_STR))
## Warning: as.hms() is deprecated, please use as_hms().
## This warning is displayed once per session.
After cleaning up the 2016 dataset, I used the row_bind function to combine the datasets into a single tibble.
Incident_data <- Incident_2018 %>%
bind_rows(Incident_2017)
Incident_data <- Incident_data %>%
bind_rows(Incident_2016)
Now that the incident data is combined, let’s explore!
The first thing that I’m interested in is the total number of police use of force cases that resulted in serious bodily injury or death.
To begin, I need to designate the year in the mmddyyy values of the INCIDENT_DATE_STR variable.
Incident_data <- Incident_data %>%
mutate(INCIDENT_DATE_STR = mdy(INCIDENT_DATE_STR))
Incident_data <- Incident_data %>%
tibble(date = INCIDENT_DATE_STR,
year = year(INCIDENT_DATE_STR),
month = month(INCIDENT_DATE_STR),
day = day(INCIDENT_DATE_STR))
Since every case represents a single incident, I can use the following function.
Incident_data %>%
count(year)
## # A tibble: 3 x 2
## year n
## <dbl> <int>
## 1 2016 782
## 2 2017 724
## 3 2018 630
ggplot(Incident_data) +
geom_bar(mapping = aes(x = year), fill = "blue")
The data shows that there has been a gradual decline in the number of police use of force incidents that result in serious bodily injury or death across the state of California.
The California DOJ also notes that 2016 was the first year that agencies reported these data, and since not every law enforcement agency had the capacity to report these data, the 2016 dataset was likely less complete than latter years. This suggests that the incident count for 2016 was likely higher than what we see in the above graph, which suggests that these specific use of force incidents in California have seen an even greater reduction over the last few years.
But while these incidents have decreased statewide, this may not be true across all counties in the state. That is, this statewide decline in police use of force cases that result in serious bodily injury or death may mask increases in incidents in specific jurisdictions.
Given this, I segmented the data by individual counties to explore whether specific counties show reverse trends.
To do this, I first cleaned up the values in the COUNTY variable by using the recode function.
Incident_data <- Incident_data %>%
mutate(COUNTY = recode(COUNTY,
"ALAMEDA" = "Alameda County",
"BUTTE" = "Butte County",
"CALAVERAS" = "Calaveras County",
"CONTRA COSTA" = "Contra Costa County",
"DEL NORTE" = "Del Norte County",
"EL DORADO" = "El Dorado County",
"FRESNO" = "Fresno County",
"GLENN" = "Glenn County",
"HUMBOLDT" = "Humboldt County",
"IMPERIAL" = "Imperial County",
"KERN" = "Kern County",
"KINGS" = "Kings County",
"LAKE" = "Lake County",
"LASSEN" = "Lassen County",
"LOS ANGELES" = "Los Angeles County",
"MADERA" = "Madera County",
"MARIN" = "Marin County",
"MARIPOSA" = "Mariposa County",
"MENDOCINO" = "Mendocino County",
"MERCED" = "Merced County",
"MONTEREY" = "Monterey County",
"NAPA" = "Napa County",
"NEVADA" = "Nevada County",
"ORANGE" = "Orange County",
"PLACER" = "Placer County",
"PLUMAS" = "Plumas County",
"RIVERSIDE" = "Riverside County",
"SACRAMENTO" = "Sacramento County",
"SAN BERNARDINO" = "San Bernardino County",
"SAN DIEGO" = "San Diego County",
"SAN FRANCISCO" = "San Francisco County",
"SAN JOAQUIN" = "San Joaquin County",
"SAN LUIS OBISPO" = "San Luis Obispo County",
"SAN MATEO" = "San Mateo County",
"SANTA BARBARA" = "Santa Barbara County",
"SANTA CLARA" = "Santa Clara County",
"SANTA CRUZ" = "Santa Cruz County",
"SHASTA" = "Shasta County",
"SISKIYOU" = "Siskiyou County",
"SOLANO" = "Solano County",
"SONOMA" = "Sonoma County",
"STANISLAUS" = "Stanislaus County",
"SUTTER" = "Sutter County",
"TEHAMA" = "Tehama County",
"TRINITY" = "Trinity County",
"TULARE" = "Tulare County",
"TUOLUMNE" = "Tuolumne County",
"VENTURA" = "Ventura County",
"YOLO" = "Yolo County",
"YUBA" = "Yuba County"))
I then summarized the data with a table and graph.
Incident_data %>%
group_by(year, COUNTY) %>%
summarize(n = n()) %>%
spread(year, n) %>%
kable()
| COUNTY | 2016 | 2017 | 2018 |
|---|---|---|---|
| Alameda County | 20 | 38 | 26 |
| Butte County | 4 | 3 | 4 |
| Calaveras County | 2 | 1 | 1 |
| Contra Costa County | 27 | 23 | 20 |
| Del Norte County | 1 | 2 | 2 |
| El Dorado County | 1 | NA | 1 |
| Fresno County | 26 | 13 | 13 |
| Glenn County | 2 | 3 | 1 |
| Humboldt County | 6 | 4 | 2 |
| Imperial County | 2 | NA | 1 |
| Inyo County | 1 | NA | NA |
| Kern County | 36 | 39 | 24 |
| Kings County | 3 | 4 | 2 |
| Lake County | 1 | 2 | 3 |
| Lassen County | 1 | 1 | NA |
| Los Angeles County | 212 | 196 | 174 |
| Madera County | 3 | 3 | 3 |
| Marin County | NA | 1 | 1 |
| Mariposa County | 1 | 1 | NA |
| Mendocino County | 2 | 2 | 4 |
| Merced County | 2 | 7 | 4 |
| Modoc County | 2 | NA | NA |
| Monterey County | 2 | 5 | 1 |
| Napa County | 4 | 3 | 3 |
| Nevada County | 1 | NA | 1 |
| Orange County | 61 | 31 | 41 |
| Placer County | 6 | 7 | 7 |
| Plumas County | 1 | 1 | NA |
| Riverside County | 60 | 40 | 45 |
| Sacramento County | 20 | 27 | 18 |
| San Benito County | 1 | NA | NA |
| San Bernardino County | 71 | 62 | 70 |
| San Diego County | 49 | 67 | 48 |
| San Francisco County | 6 | 8 | 12 |
| San Joaquin County | 23 | 21 | 7 |
| San Luis Obispo County | 2 | 3 | 1 |
| San Mateo County | 15 | 12 | 7 |
| Santa Barbara County | 15 | 8 | 9 |
| Santa Clara County | 24 | 26 | 20 |
| Santa Cruz County | 4 | 3 | 2 |
| Shasta County | 3 | 3 | 5 |
| Siskiyou County | 2 | 1 | 2 |
| Solano County | 9 | 13 | 8 |
| Sonoma County | 6 | 8 | 4 |
| Stanislaus County | 11 | 4 | 5 |
| Sutter County | 2 | 3 | 1 |
| Tehama County | 2 | 4 | 8 |
| Trinity County | NA | 1 | NA |
| Tulare County | 14 | 8 | 8 |
| Tuolumne County | NA | 1 | NA |
| Ventura County | 7 | 4 | 8 |
| Yolo County | 1 | 2 | 1 |
| Yuba County | 5 | 5 | 2 |
ggplot(Incident_data) +
geom_bar(mapping = aes(x = year)) +
facet_wrap(~ COUNTY)
From a visual observation, it seems as if the declining trend in use of force incidents that result in serious bodily injury or death has decreased in the majority of counties in California. However, there are 58 counties in California, which makes the county breakdown a bit difficult to see. To make it easier, I’ll focus on only the top 15 counties in the state. There are several counties with less than a handful of incidents, and the top 15 counties make up about 85 percent of all incidents in the state. Therefore, trends in these jurisdictions will have the greatest impact on state-wide trends.
county_sort <- Incident_data %>%
count(COUNTY, sort = TRUE) %>%
head(15)
county_sort <- Incident_data %>%
filter(COUNTY %in% county_sort$COUNTY)
ggplot(county_sort) +
geom_bar(mapping = aes(x = year)) +
facet_wrap(~ COUNTY)
In the majority of the top 15 counties in California, the declining trend in police use of force incidents that result in serious bodily injury or death holds. While some counties, such as Alameda County and San Diego County, showed a slight uptick from 2016 to 2017, they each showed demonstrable declining trends from 2017 to 2018. Other counties, such as Orange County, Riverside County and San Bernardino County, show a slight uptick in incidents from 2017 to 2018, but overall they remain quite steady across all three years.
Let’s go ahead and visually represent these use of force cases on a map of California.
To begin, let’s use map_data to subset California counties.
counties <- map_data("county")
ca_counties <- counties %>%
filter(region == "california")
Next, I want to make a vector map of California counties. This vector map will be used to plot the county data into their respective counties.
ca_base <- ggplot(data = ca_counties, mapping = aes(x = long, y = lat, group = group)) +
coord_quickmap() +
geom_polygon(color = "black", fill = "grey")
ca_base
I’m not a fan of the way this looks. So I’ll use theme_void() to remove the access grids and grey background.
ca_base <- ca_base + theme_void() +
geom_polygon(data = ca_counties, fill = NA, color = "white") +
geom_polygon(color = "white", fill = NA)
ca_base
Before I join the Incident_data with the ca_base, I want to (1) create a tibble with the county values for all California counties, (2) recode the county names to match those in the ca_base data and (3) ensure the column names are lowercase.
forcebycounty <- Incident_data %>%
count(COUNTY)
forcebycounty <- forcebycounty %>%
mutate(COUNTY = recode(COUNTY,
"Alameda County" = "alameda",
"Butte County" = "butte",
"Calaveras County" = "calaveras",
"Contra Costa County" = "contra costa",
"Del Norte County" = "del norte",
"El Dorado County" = "el dorado",
"Fresno County" = "fresno",
"Glenn County" = "glenn",
"Humboldt County" = "humboldt",
"Imperial County" = "imperial",
"Kern County" = "kern",
"Kings County" = "kings",
"Lake County" = "lake",
"Lassen County" = "lassen",
"Los Angeles County" = "los angeles",
"Madera County" = "madera",
"Marin County" = "marin",
"Mariposa County" = "mariposa",
"Mendocino County" = "mendocino",
"Merced County" = "merced",
"Monterey County" = "monterey",
"Napa County" = "napa",
"Nevada County" = "nevada",
"Orange County" = "orange",
"Placer County" = "placer",
"Plumas County" = "plumas",
"Riverside County" = "riverside",
"Sacramento County" = "sacramento",
"San Bernardino County" = "san bernardino",
"San Diego County" = "san diego",
"San Francisco County" = "san francisco",
"San Joaquin County" = "san joaquin",
"San Luis Obispo County" = "san luis obispo",
"San Mateo County" = "san mateo",
"Santa Barbara County" = "santa barbara",
"Santa Clara County" = "santa clara",
"Santa Cruz County" = "santa cruz",
"Shasta County" = "shasta",
"Siskiyou County" = "siskiyou",
"Solano County" = "solano",
"Sonoma County" = "sonoma",
"Stanislaus County" = "stanislaus",
"Sutter County" = "sutter",
"Tehama County" = "tehama",
"Trinity County" = "trinity",
"Tulare County" = "tulare",
"Tuolumne County" = "tuolumne",
"Ventura County" = "ventura",
"Yolo County" = "yolo",
"Yuba County" = "yuba"))
forcebycounty <- mutate_each(forcebycounty, funs(tolower))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
forcebycounty
## # A tibble: 53 x 2
## COUNTY n
## <chr> <chr>
## 1 alameda 84
## 2 butte 11
## 3 calaveras 4
## 4 contra costa 70
## 5 del norte 5
## 6 el dorado 2
## 7 fresno 52
## 8 glenn 6
## 9 humboldt 12
## 10 imperial 3
## # … with 43 more rows
Now, it’s time to join the two datasets using dplyr’s left_join function.
caforce <- left_join(ca_counties, forcebycounty, by = c("subregion" = "COUNTY"))
caforce <- tibble(caforce)
caforce <- caforce %>%
mutate(n = as.numeric(n))
caforce
## # A tibble: 2,977 x 7
## long lat group order region subregion n
## <dbl> <dbl> <dbl> <int> <chr> <chr> <dbl>
## 1 -121. 37.5 157 6965 california alameda 84
## 2 -122. 37.5 157 6966 california alameda 84
## 3 -122. 37.5 157 6967 california alameda 84
## 4 -122. 37.5 157 6968 california alameda 84
## 5 -122. 37.5 157 6969 california alameda 84
## 6 -122. 37.5 157 6970 california alameda 84
## 7 -122. 37.5 157 6971 california alameda 84
## 8 -122. 37.5 157 6972 california alameda 84
## 9 -122. 37.5 157 6973 california alameda 84
## 10 -122. 37.5 157 6974 california alameda 84
## # … with 2,967 more rows
Map the data
hotforce <- ca_base +
geom_polygon(data = caforce, aes(fill = n), color = "white") +
geom_polygon(color = "black", fill = NA) +
theme_void()
hotforce
We may want to adjust the color settings to make the graphic more accessible for individuals who suffer from color blindness. We can do this with the viridis package.
hotforce <- hotforce +
scale_fill_viridis(breaks = c(100, 200, 300, 400, 500))
hotforce
We can also make this graph interactive using the plotly package.
ggplotly(hotforce)